[Proposed] df50_tr,df005_tst 코드보기

Author

김보람

Published

February 15, 2024

imports

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')
%run ../function_proposed_gcn.py
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
fraudTrain.is_fraud.mean()
0.005727773406766326
df50 = throw(fraudTrain,0.5)
df_tr, df_tst = sklearn.model_selection.train_test_split(df50)

dfn = fraudTrain[::10]
dfn = dfn.reset_index(drop=True)
df_trn, df_tstn = sklearn.model_selection.train_test_split(dfn)
df_tr.shape,df_tstn.shape
((9009, 22), (26215, 22))
df2, mask = concat(df_tr, df_tstn)
df2['index'] = df2.index
df = df2.reset_index()
df.is_fraud.mean(), df_tr.is_fraud.mean(), df_tstn.is_fraud.mean()
(0.1329775153304565, 0.5034965034965035, 0.005645622735075339)
groups = df.groupby('cc_num')
# edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
# edge_index = edge_index.astype(np.float64)
# np.save('edge_index_0215.npy', edge_index)
edge_index = np.load('edge_index_0215.npy')
edge_index[:,2].mean()
11424483.3240161
plt.hist(edge_index[:,2])
(array([396764., 293276., 254602., 221676., 185488., 142054., 107284.,
         72978.,  32646.,  10480.]),
 array([       0.,  3754074.,  7508148., 11262222., 15016296., 18770370.,
        22524444., 26278518., 30032592., 33786666., 37540740.]),
 <BarContainer object of 10 artists>)

time의 차이니까…… 값이 작을수록, 거래시간이 가까운것.

theta = edge_index[:,2].mean()
plt.hist(-edge_index[:,2]/1e7)
(array([ 10480.,  32646.,  72978., 107284., 142054., 185488., 221676.,
        254602., 293276., 396764.]),
 array([-3.754074 , -3.3786666, -3.0032592, -2.6278518, -2.2524444,
        -1.877037 , -1.5016296, -1.1262222, -0.7508148, -0.3754074,
        -0.       ]),
 <BarContainer object of 10 artists>)

plt.hist(-edge_index[:,2]/theta)
(array([ 10480.,  32646.,  72978., 107284., 142054., 185488., 221676.,
        254602., 293276., 396764.]),
 array([-3.28599018, -2.95739116, -2.62879214, -2.30019313, -1.97159411,
        -1.64299509, -1.31439607, -0.98579705, -0.65719804, -0.32859902,
        -0.        ]),
 <BarContainer object of 10 artists>)

-3.647298e-01
-0.3647298
  • 마이너스 하니까 값이 클수록 거래시간이 가까움.
np.percentile(-edge_index[:,2]/theta,75), np.percentile(-edge_index[:,2]/theta,80), np.percentile(-edge_index[:,2]/theta,85), np.percentile(-edge_index[:,2]/theta,90)
(-0.36472984220131965,
 -0.27277732494466095,
 -0.18372121876074105,
 -0.09886661549285204)
(pd.DataFrame(-edge_index[:,2]/1e7)).describe()
0
count 1.717248e+06
mean -1.142448e+00
std 8.596337e-01
min -3.754074e+00
25% -1.739953e+00
50% -9.940050e-01
75% -4.166850e-01
max -0.000000e+00
(pd.DataFrame(-edge_index[:,2]/theta)).describe()
0
count 1.717248e+06
mean -1.000000e+00
std 7.524487e-01
min -3.285990e+00
25% -1.523004e+00
50% -8.700656e-01
75% -3.647298e-01
max -0.000000e+00
plt.hist((np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist())
(array([151332., 266952., 239110., 201528., 174204., 157072., 141056.,
        125410., 117738., 142846.]),
 array([0.        , 0.09999947, 0.19999895, 0.29999842, 0.3999979 ,
        0.49999737, 0.59999685, 0.69999632, 0.7999958 , 0.89999527,
        0.99999475]),
 <BarContainer object of 10 artists>)

pd.DataFrame((np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist()).describe()
0
count 1.717248e+06
mean 4.457620e-01
std 2.803088e-01
min 0.000000e+00
25% 2.042338e-01
50% 3.998302e-01
75% 6.679522e-01
max 9.999947e-01
edge_index[:,2]  = (np.exp(-edge_index[:,2]/(theta)) != 1)*(np.exp(-edge_index[:,2]/(theta))).tolist()
edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > 0.8], dtype=torch.long).t()
x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1,1)
y = torch.tensor(df['is_fraud'].values,dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index = edge_index, y=y, train_mask = mask[0], test_mask= mask[1])
data
Data(x=[35224, 1], edge_index=[2, 260578], y=[35224], train_mask=[35224], test_mask=[35224])
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat, yyhat_ = train_and_evaluate_model(data, model, optimizer)
yyhat_ = yyhat_.detach().numpy()
eval = evaluation(yy, yyhat, yyhat_)
eval
{'acc': 0.8473393095555979,
 'pre': 0.018009004502251125,
 'rec': 0.4864864864864865,
 'f1': 0.03473227206946455,
 'auc': 0.7499672102762216}

?!?!?